{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
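    "# Uncomment to install the packages this notebook imports (%pip targets the kernel's environment).\n",
    "# %pip install datasets huggingface_hub\n",
    "# %pip install pandas requests tqdm\n",
    "# %pip install pillow"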
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3cdd739c57f44d7bb7fcc0726badde67",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "from datasets import load_dataset\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Shows the Hugging Face login widget so the access token is entered interactively\n",
    "# instead of being hard-coded in the notebook.\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "from io import BytesIO\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "de04d414d43a4b28b8c2aff67095b5f0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/633 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "33c412208a3d4310b2e1d8e59a5210bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d435a3465294d47bc86556b5af5565a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/12683 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the raw image/prompt records; the result is indexed by split name\n",
    "# (the following cells use image_dataset[\"train\"]).\n",
    "image_dataset = load_dataset(\"CognitiveLab/image-prompts-raw\")\n",
    "\n",
    "# Optional: keep a local copy of the dataset on disk.\n",
    "# image_dataset.save_to_disk(\"path/to/dataset/directory\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['negativePrompt', 'steps', 'id', 'sampler', 'imageUrl', 'cfgScale', 'clipSkip', 'url', 'model', 'prompt', 'seed'],\n",
       "    num_rows: 12683\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the train split's summary (feature names and number of rows).\n",
    "image_dataset[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>negativePrompt</th>\n",
       "      <th>steps</th>\n",
       "      <th>id</th>\n",
       "      <th>sampler</th>\n",
       "      <th>imageUrl</th>\n",
       "      <th>cfgScale</th>\n",
       "      <th>clipSkip</th>\n",
       "      <th>url</th>\n",
       "      <th>model</th>\n",
       "      <th>prompt</th>\n",
       "      <th>seed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>nsfw, paintings, sketches, (worst quality:2), ...</td>\n",
       "      <td>25</td>\n",
       "      <td>127497</td>\n",
       "      <td>DPM++ SDE Karras</td>\n",
       "      <td>https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...</td>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>https://civitai.com/images/127497</td>\n",
       "      <td>chilloutmix_NiPrunedFp32Fix</td>\n",
       "      <td>best quality, (photorealistic:1.4), 1girl, (bl...</td>\n",
       "      <td>3118275117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>paintings, sketches, (worst quality:2), (low q...</td>\n",
       "      <td>30</td>\n",
       "      <td>295008</td>\n",
       "      <td>DPM++ SDE Karras</td>\n",
       "      <td>https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>https://civitai.com/images/295008</td>\n",
       "      <td>chilloutmix_NiPrunedFp32Fix</td>\n",
       "      <td>(RAW photo, best quality), (realistic, photo-r...</td>\n",
       "      <td>1684432449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(worst quality, low quality, illustration, 3d,...</td>\n",
       "      <td>46</td>\n",
       "      <td>2008378</td>\n",
       "      <td>DPM++ 2M Karras</td>\n",
       "      <td>https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...</td>\n",
       "      <td>7</td>\n",
       "      <td></td>\n",
       "      <td>https://civitai.com/images/2008378</td>\n",
       "      <td>sdvn6Realxl_detailface</td>\n",
       "      <td>Amazing detailed photography of a perfect beau...</td>\n",
       "      <td>2814388551</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(worst quality:1.4),(low quality:1.4), NSFW,cr...</td>\n",
       "      <td>22</td>\n",
       "      <td>3140516</td>\n",
       "      <td>DPM++ 2M Karras</td>\n",
       "      <td>https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...</td>\n",
       "      <td>8</td>\n",
       "      <td></td>\n",
       "      <td>https://civitai.com/images/3140516</td>\n",
       "      <td></td>\n",
       "      <td>(masterpiece, best quality,hires,high resoluti...</td>\n",
       "      <td>1105145580557338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(worst quality:1.4), (low quality:1.4), (monoc...</td>\n",
       "      <td>20</td>\n",
       "      <td>256477</td>\n",
       "      <td>DPM++ 2M Karras</td>\n",
       "      <td>https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>https://civitai.com/images/256477</td>\n",
       "      <td>luckyStrikeMix_V02Realistic</td>\n",
       "      <td>(RAW photo, best quality), (realistic, photo-r...</td>\n",
       "      <td>3589893419</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      negativePrompt steps       id  \\\n",
       "0  nsfw, paintings, sketches, (worst quality:2), ...    25   127497   \n",
       "1  paintings, sketches, (worst quality:2), (low q...    30   295008   \n",
       "2  (worst quality, low quality, illustration, 3d,...    46  2008378   \n",
       "3  (worst quality:1.4),(low quality:1.4), NSFW,cr...    22  3140516   \n",
       "4  (worst quality:1.4), (low quality:1.4), (monoc...    20   256477   \n",
       "\n",
       "            sampler                                           imageUrl  \\\n",
       "0  DPM++ SDE Karras  https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...   \n",
       "1  DPM++ SDE Karras  https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...   \n",
       "2   DPM++ 2M Karras  https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...   \n",
       "3   DPM++ 2M Karras  https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...   \n",
       "4   DPM++ 2M Karras  https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...   \n",
       "\n",
       "  cfgScale clipSkip                                 url  \\\n",
       "0       12        2   https://civitai.com/images/127497   \n",
       "1        7        2   https://civitai.com/images/295008   \n",
       "2        7           https://civitai.com/images/2008378   \n",
       "3        8           https://civitai.com/images/3140516   \n",
       "4        7        2   https://civitai.com/images/256477   \n",
       "\n",
       "                         model  \\\n",
       "0  chilloutmix_NiPrunedFp32Fix   \n",
       "1  chilloutmix_NiPrunedFp32Fix   \n",
       "2       sdvn6Realxl_detailface   \n",
       "3                                \n",
       "4  luckyStrikeMix_V02Realistic   \n",
       "\n",
       "                                              prompt              seed  \n",
       "0  best quality, (photorealistic:1.4), 1girl, (bl...        3118275117  \n",
       "1  (RAW photo, best quality), (realistic, photo-r...        1684432449  \n",
       "2  Amazing detailed photography of a perfect beau...        2814388551  \n",
       "3  (masterpiece, best quality,hires,high resoluti...  1105145580557338  \n",
       "4  (RAW photo, best quality), (realistic, photo-r...        3589893419  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Dataset.to_pandas() converts the split's underlying table in one step, rather than\n",
    "# letting the DataFrame constructor iterate the dataset one row (dict) at a time.\n",
    "pandas_data = image_dataset[\"train\"].to_pandas()\n",
    "\n",
    "# Preview the first rows to check the columns came through as expected.\n",
    "pandas_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: download a single known image before running the full loop.\n",
    "image_url = \"https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/7e1b47b4-cfe2-4b33-2c9d-6bc58031d500/width=768/7e1b47b4-cfe2-4b33-2c9d-6bc58031d500.jpeg\"\n",
    "\n",
    "# The image is written into data/, so make sure the directory exists first\n",
    "os.makedirs('data', exist_ok=True)\n",
    "\n",
    "# A timeout keeps the request from blocking forever on an unresponsive server\n",
    "response = requests.get(image_url, timeout=30)\n",
    "response.raise_for_status()  # Check for errors in the HTTP request\n",
    "\n",
    "# Open the image using PIL; JPEG cannot store alpha or palette modes, so normalise to RGB\n",
    "image = Image.open(BytesIO(response.content)).convert('RGB')\n",
    "\n",
    "# Save the image to the specified path\n",
    "image.save('data/1.jpg', 'JPEG')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Target record format for the formatted dataset (one entry per image):\n",
    "\n",
    "```json\n",
    "[\n",
    "  {\n",
    "    \"id\": \"{id}\",\n",
    "    \"image\": \"data/{id}.jpg\",\n",
    "    \"conversations\": [\n",
    "      {\n",
    "        \"from\": \"human\",\n",
    "        \"value\": \"<image>\\nWrite a prompt for Stable Diffusion to generate this image.\"\n",
    "      },\n",
    "      {\n",
    "        \"from\": \"gpt\",\n",
    "        \"value\": \"{prompt}\"\n",
    "      }\n",
    "    ]\n",
    "  }\n",
    "]\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download every image referenced in pandas_data to data/<id>.jpg.\n",
    "# Failures are printed next to the progress bar and appended to download_log.txt;\n",
    "# they do not stop the loop.\n",
    "\n",
    "# Set up logging\n",
    "logging.basicConfig(filename='download_log.txt', level=logging.ERROR)\n",
    "\n",
    "# Create the output directory up front so image.save() does not fail on a fresh checkout\n",
    "os.makedirs('data', exist_ok=True)\n",
    "\n",
    "for index, row in tqdm(pandas_data.iterrows(), total=len(pandas_data), desc=\"Downloading Images\"):\n",
    "    image_url = row['imageUrl']\n",
    "    image_id = row['id']\n",
    "    image_path = f'data/{image_id}.jpg'\n",
    "\n",
    "    # Skip images already on disk so the cell can be re-run after an interruption\n",
    "    if os.path.exists(image_path):\n",
    "        continue\n",
    "\n",
    "    try:\n",
    "        # Without a timeout a single stalled connection would hang the whole loop\n",
    "        response = requests.get(image_url, timeout=30)\n",
    "        response.raise_for_status()  # Check for errors in the HTTP request\n",
    "\n",
    "        # JPEG cannot store alpha or palette modes, so normalise to RGB before saving\n",
    "        image = Image.open(BytesIO(response.content)).convert('RGB')\n",
    "\n",
    "        # Write to a temporary name and rename, so an interrupted save never leaves\n",
    "        # a truncated file that the skip check above would treat as complete\n",
    "        partial_path = image_path + '.part'\n",
    "        image.save(partial_path, 'JPEG')\n",
    "        os.replace(partial_path, image_path)\n",
    "    except Exception as e:\n",
    "        error_message = f\"Error downloading image {image_id}: {str(e)}\"\n",
    "        tqdm.write(error_message)\n",
    "\n",
    "        # Log the exception to the specified file\n",
    "        logging.error(f\"Image ID: {image_id}, URL: {image_url}\\n{error_message}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Formatted data saved to ./data/formatted_data.json\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Build one conversation-style record per row of pandas_data and write them all\n",
    "# to ./data/formatted_data.json.\n",
    "\n",
    "# Round-trip through to_json so every value is a plain, JSON-serialisable Python type\n",
    "json_data = pandas_data.to_json(orient='records', lines=False)\n",
    "data_list = json.loads(json_data)\n",
    "\n",
    "# Generation settings that make up the answer, in the order they are emitted\n",
    "prompt_fields = [\"prompt\", \"negativePrompt\", \"steps\", \"sampler\", \"model\", \"cfgScale\", \"clipSkip\"]\n",
    "\n",
    "# Create a new list to store the formatted data\n",
    "formatted_data = []\n",
    "\n",
    "for record in data_list:\n",
    "    formatted_prompt = {field: record[field] for field in prompt_fields}\n",
    "\n",
    "    formatted_record = {\n",
    "        \"id\": record[\"id\"],\n",
    "        \"image\": f\"data/{record['id']}.jpg\",\n",
    "        \"conversations\": [\n",
    "            {\"from\": \"human\", \"value\": \"<image>\\n Write a prompt for Stable Diffusion to generate this image in json formate :\"},\n",
    "            # The question asks for JSON, so serialise with json.dumps; formatting the dict\n",
    "            # into an f-string would emit a Python repr (single quotes, None) instead.\n",
    "            # ensure_ascii=False keeps non-ASCII prompt text readable inside the answer.\n",
    "            {\"from\": \"gpt\", \"value\": json.dumps(formatted_prompt, ensure_ascii=False)}\n",
    "        ]\n",
    "    }\n",
    "    formatted_data.append(formatted_record)\n",
    "\n",
    "# Save the formatted data to a JSON file\n",
    "output_file_path = './data/formatted_data.json'\n",
    "with open(output_file_path, 'w') as json_file:\n",
    "    json.dump(formatted_data, json_file, indent=2)\n",
    "\n",
    "print(f\"Formatted data saved to {output_file_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0b3a708df0074bdd9d1b2790cd6b3709",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Read the records back from the path the formatting cell writes to (./data/formatted_data.json)\n",
    "formatted_dataset = load_dataset(\"json\", data_files=\"./data/formatted_data.json\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "data-venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
